#!/usr/bin/env python
# -*- coding: utf-8 -*-
# --------------------------------------------------
#Author:    LJ
#Email:     admin@attacker.club

#Date:      2018/4/7
#Description:
# --------------------------------------------------


#detail_page = path + "/" + line.split("/")[-2]
#nternal_url = url_line.split("/")[-2]
#Default index is -1; use -2 when the URL ends with "/"



# --------------------------------------------------



import os, re
import requests
import time

# Import modules


# Request headers that make the crawler look like a desktop Chrome browser.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;",
    "Accept-Encoding": "gzip",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Referer": "https://www.zmrenwu.com/",
    "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/42.0.2311.90 Safari/537.36"),
}


def geturl(request):
    """Fetch a page and rewrite its internal links so the saved HTML
    works when browsed locally.

    Relies on the module-level globals ``url`` (site root), ``content``
    (article path fragment, e.g. "/post/") and ``headers``.

    :param request: absolute URL, or a path relative to the site root.
    :return: the page HTML with article/CSS hrefs rewritten to local
             file names.
    """
    if request.find('http') > -1:
        # Absolute / external URL.
        # BUG FIX: headers must be passed as a keyword argument; as the
        # second positional argument requests.get() treats it as query
        # params and the headers are never sent.
        res = requests.get(request, headers=headers)
    else:
        # Relative / internal URL: prefix the site root.
        res = requests.get(url + request, headers=headers)

    html = res.text

    # e.g. https://www.zmrenwu.com/post/2/
    allurls = re.findall(r'href="([^"]+)"', html)

    for url_line in allurls:
        if url_line.find(content) > -1:
            # Article link like href="/post/3/": the URL ends with "/",
            # so the slug is the second-to-last path segment.
            internal_url = url_line.split("/")[-2]
            html = html.replace(url_line, internal_url + ".html")
        elif url_line.find("/css/") > -1:
            # Stylesheet link: keep just the file name.
            internal_url = url_line.split("/")[-1]
            html = html.replace(url_line, internal_url)

    return html


def savehtml(text, path):
    """Write *text* to the file at *path* (overwriting it).

    BUG FIX: the original wrote the module-level global ``HTML`` instead
    of the ``text`` parameter, so it only worked when the caller happened
    to set that global first.
    """
    with open(path, 'w') as f:
        # "with" closes the file automatically; no explicit close needed.
        f.write(text)


if __name__ == "__main__":
    url = "https://www.zmrenwu.com" # 域名
    content = "/post/"  # 内容url
    index = requests.get(url + content+"2")
    #print(index.text)  # 内容


    # 创建url的保存目录
    path = os.path.join(url.split("/")[2])
    if not os.path.exists(path):
        os.mkdir(path)



    all_urls = re.findall(r'href="([^"]+)"', index.text) #抓取所有url


    for line in all_urls:

        if line.find("/css/") > -1 : #匹配css,直接下载
            if line.find('http') > -1: #绝对路径url
                res = requests.get(line, headers)
            else:
                res = requests.get(url + line, headers)

            HTML=res.text #内容
            CSS_page = path + "/" + line.split("/")[-1]
            savehtml(HTML, CSS_page)
            print('读取URL:', line, '保存为:', CSS_page)



        elif line.find(content) > -1 and line.find('http') : #匹配文章,本地化url

            detail_page = path + "/" + line.split("/")[-2] + ".html" #默认-1,url结尾是/的 值为-2

            HTML = geturl(line)
            savehtml(HTML, detail_page)
            print('读取URL:', line, '保存为:', detail_page)

            #time.sleep(1.5)

        else:
            continue

    print("\n\033[31m 下载完毕 !!!\033[0m")



"""
url='xxx'
res = requests.get(url)
html = res.text

"""